Region <- read_csv("~/Documents/Data visualisation/Region.csv")
Case <- read_csv("~/Documents/Data visualisation/Case.csv")
PatientInfo <- read_csv("~/Documents/Data visualisation/PatientInfo.csv")
Policy <- read_csv("~/Documents/Data visualisation/Policy.csv")
SearchTrend <- read_csv("~/Documents/Data visualisation/SearchTrend.csv")
SeoulFloating <- read_csv("~/Documents/Data visualisation/SeoulFloating.csv")
Time <- read_csv("~/Documents/Data visualisation/Time.csv")
TimeAge <- read_csv("~/Documents/Data visualisation/TimeAge.csv")
TimeGender <- read_csv("~/Documents/Data visualisation/TimeGender.csv")
TimeProvince <- read_csv("~/Documents/Data visualisation/TimeProvince.csv")
Weather <- read_csv("~/Documents/Data visualisation/Weather.csv")

1 COVID-19 on The Map

# Filling in missing values of longitude and latitude based on the names of 
# infection_case

case_1 <- Case %>% group_by(infection_case) %>% select(province, latitude, 
                                                       longitude, confirmed)

case_1$latitude <- ifelse(case_1$infection_case == "Coupang Logistics Center", 
                          37.233581, case_1$latitude)  
case_1$longitude <- ifelse(case_1$infection_case == "Coupang Logistics Center", 
                           127.360893, case_1$longitude) 

case_1$latitude <- ifelse(case_1$infection_case == "Gangnam Yeoksam-dong 
                          gathering", 37.500259, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Gangnam Yeoksam-dong 
                           gathering", 127.038737, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Geumcheon-gu rice milling 
                          machine manufacture", 37.460568, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Geumcheon-gu rice milling 
                           machine manufacture", 126.900830, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Sincheonji Church", 
                          35.84008, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Sincheonji Church", 
                           128.5667, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Seocho Family", 37.515324, 
                          case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Seocho Family", 127.014225, 
                           case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Orange Life", 37.503305, 
                          case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Orange Life", 127.045828, 
                           case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Shincheonji Church", 
                          35.84008, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Shincheonji Church", 
                           128.5667, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Itaewon Clubs", 37.534930, 
                          case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Itaewon Clubs", 126.994520, 
                           case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Korea Campus Crusade of 
                          Christ", 37.594852, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Korea Campus Crusade of 
                           Christ",126.967926, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Geumcheon-gu rice milling 
                          machine manufacture", 37.460613, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Geumcheon-gu rice milling 
                           machine manufacture", 126.901118, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Seoul City Hall Station 
                          safety worker", 37.565701, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Seoul City Hall Station 
                           safety worker", 126.976864, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Uijeongbu St. Mary’s 
                          Hospital", 37.758598, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Uijeongbu St. Mary’s 
                           Hospital", 127.077668, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Eunpyeong-Boksagol culture 
                          center", 37.603601, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Eunpyeong-Boksagol culture 
                           center", 126.927424, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Dongan Church", 37.593832, 
                          case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Dongan Church", 127.057759, 
                           case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Eunpyeong St. Mary's 
                          Hospital", 37.633300, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Eunpyeong St. Mary's 
                           Hospital", 126.917613, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Jongno Community Center", 
                          37.574463, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Jongno Community Center", 
                           126.964505, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Samsung Medical Center", 
                          37.487436, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Samsung Medical Center", 
                           127.084983, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "KB Life Insurance", 
                          37.524524, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "KB Life Insurance", 
                           126.924621, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Samsung Fire & Marine 
                          Insurance", 37.569246, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Samsung Fire & Marine 
                           Insurance", 127.061769, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Haeundae-gu Catholic 
                          Church", 35.163898, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Haeundae-gu Catholic 
                           Church", 129.164522, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Fatima Hospital", 35.883999, 
                          case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Fatima Hospital", 
                           128.623859, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Dunsan Electronics Town", 
                          36.339820, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Dunsan Electronics Town", 
                           127.394966, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Ministry of Oceans and 
                          Fisheries", 36.504487, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Ministry of Oceans and 
                           Fisheries", 127.265410, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Bundang Jesaeng Hospital", 
                          37.388157, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Bundang Jesaeng Hospital", 
                           127.121753, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Goesan-gun Jangyeon-myeon", 
                          36.835064, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Goesan-gun Jangyeon-myeon", 
                           127.934339, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Hanmaeum Changwon Hospital", 
                          35.221051, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Hanmaeum Changwon Hospital", 
                           128.686697, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Yangcheon Table Tennis 
                          Club", 37.545121, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Yangcheon Table Tennis 
                           Club", 126.862415, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Guro-gu Call Center", 
                          37.482466, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Guro-gu Call Center", 
                           126.889889, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Manmin Central Church", 
                          37.275929, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Manmin Central Church", 
                           127.448461, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Wangsung Church", 37.481782, 
                          case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Wangsung Church", 
                           126.929964, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Jung-gu Fashion Company", 
                          37.569287, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Jung-gu Fashion Company", 
                           127.008984, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Yeongdeungpo Learning 
                          Institute", 37.513190, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Yeongdeungpo Learning 
                           Institute", 126.914717, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Uiwang Logistics Centere", 
                          37.362416, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Uiwang Logistics Center", 
                           126.989844, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Jin-gu Academy", 35.822701, 
                          case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Jin-gu Academy", 
                           128.639709, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Cheongdo Daenam Hospital", 
                          35.647365, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Cheongdo Daenam Hospital", 
                           128.733907, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Orange Town", 36.339701, 
                          case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Orange Town", 127.390509, 
                           case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Lord Glory Church", 
                          37.491842, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Lord Glory Church", 
                           126.983049, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Suwon Saeng Myeong Saem 
                          Church", 37.563267, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Suwon Saeng Myeong Saem 
                           Church", 126.987304, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Gyeongsan Jeil Silver Town", 
                          35.833864, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Gyeongsan Jeil Silver 
                           Town", 128.809070, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Gyeongsan Cham Joeun 
                          Community Center", 35.832055, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Gyeongsan Cham Joeun 
                           Community Center", 128.753115, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Geochang-gun 
                          Woongyang-myeon", 35.832104, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Geochang-gun 
                           Woongyang-myeon", 127.924074, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Changnyeong Coin Karaoke", 
                          35.800795, case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Changnyeong Coin Karaoke", 
                           128.491157, case_1$longitude)

case_1$latitude <- ifelse(case_1$infection_case == "Soso Seowon", 36.924874, 
                          case_1$latitude)
case_1$longitude <- ifelse(case_1$infection_case == "Soso Seowon", 128.580175, 
                           case_1$longitude)

case_1
## # A tibble: 174 × 5
## # Groups:   infection_case [81]
##    infection_case                   province latitude  longitude  confirmed
##    <chr>                            <chr>    <chr>     <chr>          <dbl>
##  1 Itaewon Clubs                    Seoul    37.53493  126.99452        139
##  2 Richway                          Seoul    37.48208  126.901384       119
##  3 Guro-gu Call Center              Seoul    37.482466 126.889889        95
##  4 Yangcheon Table Tennis Club      Seoul    37.546061 126.874209        43
##  5 Day Care Center                  Seoul    37.679422 127.044374        43
##  6 Manmin Central Church            Seoul    37.275929 127.448461        41
##  7 SMR Newly Planted Churches Group Seoul    -         -                 36
##  8 Dongan Church                    Seoul    37.593832 127.057759        17
##  9 Coupang Logistics Center         Seoul    37.233581 127.360893        25
## 10 Wangsung Church                  Seoul    37.481782 126.929964        30
## # ℹ 164 more rows
# Covert data type of latitude and longitude
case_1$latitude <- as.numeric(case_1$latitude)
case_1$longitude <- as.numeric(case_1$longitude)

# Remove missing data points
case_1 <- na.omit(case_1)

# Label cases on the map of South Korea
map <- case_1 %>%
  leaflet() %>%
  addProviderTiles(providers$Esri) %>% 
  addCircleMarkers(lng = ~ longitude, 
                   lat = ~ latitude, 
                   radius = ~ 4, 
                   fillOpacity = 0.6, 
                   popup = ~ confirmed,
                   label = ~ infection_case,
                   fill = ~ confirmed,
                   stroke = FALSE
                   ) 
map
# Calculate percentages for each province
case_2 <- Case %>% 
  group_by(province) %>% 
  summarise(sum = sum(confirmed)) %>%
  ungroup() %>% 
  mutate(percentage = sum / sum(sum) * 100)

# Create a bar chart to visualize the percentage
case_2 <- case_2 %>%
  mutate(bar_color = ifelse(percentage == max(percentage), "dodgerblue2", "blue4"))

ggplot(case_2, aes(x = reorder(province, percentage), y = percentage, fill = bar_color)) +
  geom_bar(stat = "identity") +
  scale_fill_identity() +
  xlab("Province") +
  ylab(NULL) +
  geom_text(
    aes(label = sprintf("%.1f%%", percentage)), 
    position = position_nudge(y = 5), 
    vjust = 0, 
    size = 3,  
    color = "black",  
    show.legend = FALSE
  ) +
  coord_flip() +
  theme_minimal() +
  ggtitle("Daegu Leads in COVID-19 Cases Among Korean Provinces") +
  labs(subtitle = "Percentage of Confirmed Cases in Each of the 17 Korean Provinces") +
  theme(
    panel.grid.major.y = element_blank(),
    plot.title = element_text(size = 13, face = "bold", hjust = -1.78),
    plot.subtitle = element_text(size = 11, hjust = -1.07),
    legend.position = "none"
  ) +
  scale_y_continuous(labels = percent_format(scale = 1))

2 Impact of Demographic Factors on COVID-19

# Visualization 1: Impact of Demographic Factors on Case number
Patient_analysis <- PatientInfo %>%
  group_by(age, sex) %>%
  summarise(count = n()) %>% 
  filter(!is.na(age)& !is.na(sex) & age != "100s")

ggplot(Patient_analysis, aes(x = age, y = count, fill = sex)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme_minimal() +
  labs(title = "Dominant COVID-19 Incidence in Women, Peaking Among Those in Their 20s", 
       subtitle = "Age and Gender Distribution of COVID-19 Cases in South Korea", 
       x = "Age",
       y = "Number of COVID cases") +
  theme(
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    plot.title = element_text(size = 13, face = "bold"),
    plot.subtitle = element_text(size = 11)
  )+
  scale_fill_manual(values = c("blue4", "dodgerblue2")) 

# Visualization 2: Impact of Demographic Factors on Mortality 

Patient_analysis2 <- PatientInfo %>%
  filter(state =="deceased") %>%
  group_by(age, sex) %>%
  summarise(count = n()) %>% 
  filter(!is.na(age)& !is.na(sex))

ggplot(Patient_analysis2, aes(x = age, y = count, fill = sex)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme_minimal() +
  labs(title = "Highest COVID-19 Mortality Observed in Women Aged 80 and Above", 
       subtitle = "Age and Gender Distribution of COVID-19 Mortality in South Korea", 
       x = "Age",
       y = "Number of People Deceased from COVID-19") +
  theme(
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    plot.title = element_text(size = 13, face = "bold"),
    plot.subtitle = element_text(size = 11)
  ) +
  scale_fill_manual(values = c("blue4", "dodgerblue2")) 

# Visualization 3: Gender-wise Case Distribution Over Time
TimeGender_cleaned <- TimeGender %>%
  group_by(sex) %>%
  mutate(
    new_confirmed = ifelse(row_number() == 1, confirmed, confirmed - lag(confirmed)),
    new_deceased = ifelse(row_number() == 1, deceased, deceased - lag(deceased)),
    date = as.Date(date, format="%Y-%m-%d")  # Ensure date is in Date format
  ) %>%
  filter(date != "2020-03-02")

ggplot(data = TimeGender_cleaned, aes(x = as.Date(date), y = new_confirmed, colour = sex)) +
  geom_smooth() +
  scale_color_manual(values = c("blue4", "deepskyblue1")) +
  theme_minimal() +
  labs(
    title = "Higher COVID-19 Case Rates Among Women in Early Stages of the Pandemic",
    subtitle = "Gender-wise COVID-19 Case Distribution Over Time, Data from 20/01/2020 to 30/06/2020",
    y = "Number of COVID-cases"
  ) + 
  xlab(NULL) +
  theme(
    panel.grid.major = element_blank(), 
    panel.background = element_blank(),
    plot.title = element_text(size = 13, face = "bold"),
    plot.subtitle = element_text(size = 11)
  ) 

ggplot(data = TimeGender_cleaned, aes(x = as.Date(date), y = new_deceased, colour = sex)) +
  geom_smooth() +
  scale_color_manual(values = c("blue4", "deepskyblue1")) +
  theme_minimal() +
  labs(
    title = "Equal COVID-19 Fatality Rates Among Men and Women",
    subtitle = "Gender-wise COVID-19 Mortality Distribution Over Time, Data from 20/01/2020 to 30/06/2020",
    y = "Number of People Deceased due to COVID-19"
  ) + 
  xlab(NULL) +
  theme(
    panel.grid.major = element_blank(), 
    panel.background = element_blank(),
    plot.title = element_text(size = 13, face = "bold"),
    plot.subtitle = element_text(size = 11)
  ) 

# Visualisation 4: Region contributing factors to Mortality

## Summarize confirmed and deceased cases by province

# Calculate new cases, new tests, and new deceased
summary_timeProvince <- TimeProvince %>%
  filter(date=="2020-06-30")

## Join with the Region dataset
combined_data <- Region %>%
  inner_join(summary_timeProvince, by = "province")

## Selecting relevant columns for correlation
correlation_data <- combined_data %>%
  select(elementary_school_count, kindergarten_count, university_count,
         academy_ratio, elderly_population_ratio, elderly_alone_ratio,
         nursing_home_count, confirmed, released, deceased)

## Compute correlation matrix
correlation_matrix <- cor(correlation_data, use = "complete.obs")
  1. Low Correlation with School Counts: Elementary, kindergarten, and university counts have very low correlations with both confirmed and deceased cases. This suggests that the number of these educational institutions in a region did not significantly impact COVID-19 spread or mortality.

  2. Moderate Correlation with Academy Ratio: The academy ratio shows a moderate positive correlation with confirmed cases. This might indicate that regions with a higher concentration of academies had slightly higher infection rates.

  3. Negative Correlation with Elderly Population Ratios: Interestingly, both elderly population ratio and elderly alone ratio show a small negative correlation with confirmed and deceased cases. This could suggest that regions with higher elderly populations didn’t necessarily experience more severe outbreaks, contrary to what might be expected.

  4. Positive Correlation with Nursing Home Count: There’s a positive correlation with nursing home count, although it’s not very strong. This might indicate that regions with more nursing homes had slightly higher confirmed and deceased cases, which aligns with global trends where nursing homes were often hotspots.

## Academic ratio correlation 
correlation_academic_ratio <- cor(combined_data$academy_ratio, combined_data$confirmed)

ggplot(combined_data, aes(x = academy_ratio, y = confirmed)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  theme_minimal() +
  labs(title = paste("Positive correlation between Academy Ratio and COVID-19 Cases (", round(correlation_academic_ratio, 2), ")"),
       x = "Academy Ratio",
       y = "Total Confirmed Cases") + 
    theme(
    plot.title = element_text(size = 12, face = "bold")
  )

## Elderly population correlation 

correlation_elderly_pop <- cor(combined_data$elderly_population_ratio, combined_data$confirmed)

ggplot(combined_data, aes(x = elderly_population_ratio, y = confirmed)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red") +
  theme_minimal() +
  labs(title = "Elderly Population Ratio vs. Total Confirmed COVID-19 Cases",
       subtitle = paste("Negative correlation (", round(correlation_elderly_pop, 2), ") challenges expectations"),
       x = "Elderly Population Ratio",
       y = "Total Confirmed Cases") + 
    theme(
    plot.title = element_text(size = 12, face = "bold")
  )

correlation_elderly_pop_deceased <- cor(combined_data$elderly_population_ratio, combined_data$deceased)

ggplot(combined_data, aes(x = elderly_population_ratio, y = deceased)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red") +
  theme_minimal() +
  labs(title = "Elderly Population Ratio vs. COVID-19 Mortality",
       subtitle = paste("Negative correlation (", round(correlation_elderly_pop_deceased, 2), ") challenges expectations"),
       x = "Elderly Population Ratio",
       y = "Amount of People Deceased due to COVID-19") + 
    theme(
    plot.title = element_text(size = 12, face = "bold")
  )

## Nursing home correlation, without two extreme values

threshold_value <- 4000  
filtered_data <- combined_data %>%
  filter(nursing_home_count <= threshold_value)

correlation_nursing_home <- cor(filtered_data$nursing_home_count, filtered_data$confirmed)

ggplot(filtered_data, aes(x = nursing_home_count, y = confirmed)) +
  geom_point() +
  geom_smooth(method = "lm", color = "green") +
  theme_minimal() +
  labs(title = paste("Positive correlation between Nursing Home Count and COVID-19 Cases (",round(correlation_nursing_home, 2),")"),
       x = "Nursing Home Count",
       y = "Total Confirmed Cases")+ 
  theme(
  plot.title = element_text(size = 12, face = "bold")
  )

correlation_nursing_home_deceased <- cor(filtered_data$nursing_home_count, filtered_data$deceased)

ggplot(filtered_data, aes(x = nursing_home_count, y = deceased)) +
  geom_point() +
  geom_smooth(method = "lm", color = "green") +
  theme_minimal() +
  labs(title = paste("No correlation between Nursing Home Count and COVID-19 Cases"),
       x = "Nursing Home Count",
       y = "Total Confirmed Cases")+ 
  theme(
  plot.title = element_text(size = 12, face = "bold")
  )

# Policies during COVID - 19

# Convert date columns to Date format
Policy$start_date <- as.Date(Policy$start_date)
Policy$end_date <- as.Date(Policy$end_date)

# Filter out policies with missing end dates
Policy <- Policy[!is.na(Policy$end_date), ]
# Bar plot of policy types 
#The below graph shows the distribution of policy type by graphs depicting the education policies to have run for the highest number of times

Policy_summary <- Policy %>%
  group_by(type) %>%
  summarize(Count = n()) %>%
  arrange(desc(Count))

# Create the bar chart with ordered bars
ggplot(Policy_summary, aes(x = reorder(type, -Count), y = Count)) +
  geom_bar(stat = "identity", fill = "blue4") +
  labs(title = "COVID-19 Policies During the First Half of 2020 Focus on Education",
       subtitle = "Examining Policy Types from 20/01/2020 to 30/06/2020",
       x = "Policy Types") +
  ylab(NULL) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(), 
    panel.background = element_blank(),
    plot.title = element_text(size = 13, face = "bold"),
    plot.subtitle = element_text(size = 11)
  )

# Timeline plot of policies 
#The below graph shows what policy was introduced during what stage of the pandemic, understanding the social dymanic at each stage

ggplot(Policy, aes(x = start_date, y = fct_reorder(type, start_date), label = type)) +
  geom_point(color = "blue4") +
  geom_text_repel() +
  labs(title = "A Shift Towards Administrative, Transformation and Health Policies in the 2nd Quarter of 2020", subtitle = "Timeline of Government Policies from 20/01/2020 to 30/06/2020",
       x = "Start Date", y = "Policy Types") +
  theme_minimal() + 
  theme(
    panel.grid.major = element_blank(), 
    panel.background = element_blank(),
    plot.title = element_text(size = 13, face = "bold", hjust = 0.43),
    plot.subtitle = element_text(size = 11, hjust = -0.5)
  ) 

3 Policy Impact on COVID - 19

Reading and cleaning the policy data.

# Replacing NA in 'end_date' with a default date and converting to Date type
policy_data_cleaned <- Policy %>%
  mutate(end_date = ifelse(is.na(end_date), "2020-06-30", as.character(end_date)),
         end_date = as.Date(end_date, format="%Y-%m-%d"))

Reading and Cleaning Time-Series data.

# Calculating new cases, tests, and deceased, ensuring 'date' is in Date format
time_data_cleaned <- Time %>%
  mutate(
    new_tests = ifelse(row_number() == 1, test, test - lag(test)),
    new_confirmed = ifelse(row_number() == 1, confirmed, confirmed - lag(confirmed)),
    new_deceased = ifelse(row_number() == 1, deceased, deceased - lag(deceased)),
    date = as.Date(date, format="%Y-%m-%d")  
  )

Defining a function to calculate the number of active policies on a given date.

active_policies_on_date <- function(date, policies) {
  active_policies <- policies %>%
    filter(start_date <= date & end_date >= date)
  return(nrow(active_policies))
}

# Applying the function to get the active policy count for each date
time_data_cleaned <- time_data_cleaned %>%
  rowwise() %>%
  mutate(active_policy_count = active_policies_on_date(date, policy_data_cleaned))

Now we make a plot which shows the number of new confirmed cases and active policies over time to see if there is any relationship. We also highlight the period where key policies including the first strong Social Distancing campaign, and the drive through screening centers.

# Highlighting a specific period in the data
time_data_cleaned$highlight <- ifelse(time_data_cleaned$date >= as.Date("2020-02-23") & 
                                      time_data_cleaned$date <= as.Date("2020-02-29"), 
                                      "Key Policy Period", NA)

# Plotting confirmed cases and active policies over time with dynamic y-axis limits
ratio <- 20
max_confirmed <- max(time_data_cleaned$new_confirmed, na.rm = TRUE)
upper_limit <- ceiling(max_confirmed / 200) * 200  

# Plotting the data with ggplot2
ggplot(time_data_cleaned, aes(x = date)) +
  geom_rect(data = subset(time_data_cleaned, !is.na(highlight)), 
            aes(xmin = as.Date("2020-02-23"), xmax = as.Date("2020-02-29"), 
                ymin = -Inf, ymax = Inf, fill = highlight), alpha = 0.9) +
  geom_line(aes(y = new_confirmed, colour = "Daily Confirmed Cases"),size=1) +
  geom_line(aes(y = active_policy_count * ratio, colour = "Number of Active Policies"),size=0.9) +
  scale_fill_manual(name = "Highlight", 
                    values = c("Key Policy Period" = "yellow"), 
                    labels = c("Key Policy Period" = "Introduction of Key Policies")) +
  guides(fill = guide_legend(override.aes = list(alpha = 0.7))) +
  scale_colour_manual("", 
                      breaks = c("Daily Confirmed Cases", "Number of Active Policies"),
                      values = c("brown1", "blue4")) +
  scale_y_continuous("Daily Confirmed Cases", 
                     sec.axis = sec_axis(~ . / ratio, name = "Number of Active Policies"),
                     limits = c(0, upper_limit),  # Dynamically set the upper limit
                     breaks = seq(0, upper_limit, by = 200)) +  # Set breaks to match the adjusted limits
  labs(title = "Daily Confirmed COVID-19 Cases and Number of Active Policies Over Time",
       subtitle = 'Data from 20/01/2020 to 30/06/2020',
       x = "Date", y = "Daily Confirmed Cases") +
  theme_minimal() +
  theme(
    axis.title.y = element_text(color = "brown1"),
    axis.text.y = element_text(color = "brown1"),
    axis.title.y.right = element_text(color = "blue4"),
    axis.text.y.right = element_text(color = "blue4"),
    legend.position = "bottom",
    legend.title = element_blank(), # Remove the title of the legend if not needed
    panel.grid.major.y = element_line(),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    panel.background = element_blank(), # Remove vertical gridlines
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.background = element_blank(), # Remove legend background
    legend.key = element_blank(),  # Remove legend key boxes
    plot.title = element_text(size = 13, face = "bold"),
    plot.subtitle = element_text(size = 11)
  ) 

The yellow highlighted period shows the period of introduction of some key policies, including social distancing, Infectious Disease alert level 4, and drive through screening centers. We can

# Calculate the scaling factor for the secondary axis
# This will scale the active policies down to fit the range of new_deceased
# Define the maximum values for new_deceased and active_policy_count
max_deceased <- max(time_data_cleaned$new_deceased, na.rm = TRUE)
max_policy_count <- max(time_data_cleaned$active_policy_count, na.rm = TRUE)

# Calculate the scaling factor for the secondary axis
#scaling_factor <- max_deceased / max_policy_count
scaling_factor <- 0.2
# Plotting the data with ggplot2
ggplot(time_data_cleaned, aes(x = date)) +
  # Draw the yellow highlight for the key policy period
  geom_rect(data = subset(time_data_cleaned, !is.na(highlight)), 
            aes(xmin = as.Date("2020-02-23"), xmax = as.Date("2020-02-29"), 
                ymin = -Inf, ymax = Inf, fill = highlight), alpha = 0.1) +
  # Add a line for the new_deceased
  geom_line(aes(y = new_deceased, colour = "New Deceased"),size=0.9) +
  # Add a line for the active_policy_count, scaled to fit the graph
  geom_line(aes(y = active_policy_count * scaling_factor, colour = "Number of Active Policies"),size=0.9) +
  # Define the fill color for the highlight and make the legend key solid
  scale_fill_manual(name = "Highlight", 
                    values = c("Key Policy Period" = "yellow"), 
                    labels = c("Key Policy Period" = "Introduction of Key Policies, Including Social Distancing")) +
  guides(fill = guide_legend(override.aes = list(alpha = 1))) +
  # Set the colors for the lines
  scale_colour_manual(values = c("New Deceased" = "brown1", "Number of Active Policies" = "blue4")) +
  # Set the y-axis for new_deceased and create the secondary y-axis for active_policy_count
  scale_y_continuous("New Deceased", 
                     sec.axis = sec_axis(~ . / scaling_factor, name = "Number of Active Policies"),
                     limits = c(0, 10),  # Set the upper limit to 8 as specified
                     breaks = seq(0, 10, by = 2)) +  # Set breaks to go up in steps of 2 y-axis
  # Add labels and titles
  labs(title = "Daily COVID-19 Deceased and Number of Active Policies Over Time",
       subtitle = "Data from 20/01/2020 to 30/06/2020",
       x = "Date", y = "Number of New Deceased") +
  # Apply the minimal theme
  theme_minimal() +
  # Adjust text elements
  theme(axis.title.y = element_text(color = "brown1"),
        axis.text.y = element_text(color = "brown1"),
        axis.title.y.right = element_text(color = "blue4"),
        axis.text.y.right = element_text(color = "blue4"),
        legend.position = "bottom",
        legend.title = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid.major.y = element_line(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), # Remove vertical gridlines
        legend.background = element_blank(), # Remove legend background
        legend.key = element_blank(),  # Remove legend key boxes
       plot.title = element_text(size = 13, face = "bold"),
       plot.subtitle = element_text(size = 11)
        ) 

# Calculate the moving average for new_deceased
time_data_cleaned$moving_average_deceased <- rollmean(time_data_cleaned$new_deceased, k = 7, fill = NA, align = 'right')

# Calculate the scaling factor for the secondary axis
#scaling_factor <- max(time_data_cleaned$moving_average_deceased, na.rm = TRUE) / #max(time_data_cleaned$active_policy_count, na.rm = TRUE)
scaling_factor <- 0.2
# Plotting the data with ggplot2
ggplot(time_data_cleaned, aes(x = date)) +
  geom_rect(data = subset(time_data_cleaned, !is.na(highlight)), 
            aes(xmin = as.Date("2020-02-23"), xmax = as.Date("2020-02-29"), 
                ymin = -Inf, ymax = Inf, fill = highlight), alpha = 0.1) +
  geom_line(aes(y = moving_average_deceased, colour = "Moving Average New Deceased"),size=0.95) +
  geom_line(aes(y = active_policy_count * scaling_factor, colour = "Number of Active Policies"),size=0.95) +
  scale_fill_manual(name = "Highlight", 
                    values = c("Key Policy Period" = "yellow"), 
                    labels = c("Key Policy Period" = "Introduction of Key Policies")) +
  guides(fill = guide_legend(override.aes = list(alpha = 0.7))) +
  scale_colour_manual("", 
                      breaks = c("Moving Average New Deceased", "Number of Active Policies"),
                      values = c("Moving Average New Deceased" = "brown1", "Number of Active Policies" = "blue4")) +
  scale_y_continuous("New Deceased", 
                     sec.axis = sec_axis(~ . / scaling_factor, name = "Number of Active Policies"),
                     limits = c(0, 10),  # Set the upper limit to 8 as specified
                     breaks = seq(0, 10, by = 2)) +  # Set breaks to go up in steps of 2

  labs(title = "Daily COVID-19 Deceased and Number of Active Policies Over Time",
       subtitle = "Data from 20/01/2020 to 30/06/2020",
       x = "Date", y = "Number of New Deceased (7-day Moving Average)") +
  theme_minimal() +
  theme(
    axis.title.y = element_text(color = "brown1"),
    axis.text.y = element_text(color = "brown1"),
    axis.title.y.right = element_text(color = "blue4"),
    axis.text.y.right = element_text(color = "blue4"),
    legend.position = "bottom",
    legend.title = element_blank(),
    panel.grid.major.y = element_line(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(size = 13, face = "bold"),
    plot.subtitle = element_text(size = 11)
  ) 

5 Mortality And Infection Rate

merge <- merge(SearchTrend, Time, by = "date")
glimpse(merge)
## Rows: 162
## Columns: 11
## $ date        <date> 2020-01-20, 2020-01-21, 2020-01-22, 2020-01-23, 2020-01-2…
## $ cold        <dbl> 0.19217, 0.22462, 0.23808, 0.30308, 0.34689, 0.70888, 0.96…
## $ flu         <dbl> 0.70343, 0.59789, 0.56661, 0.55625, 0.40226, 0.39744, 0.40…
## $ pneumonia   <dbl> 3.63716, 4.31987, 3.66416, 3.18035, 2.48156, 3.40926, 3.43…
## $ coronavirus <dbl> 20.69610, 35.33284, 29.74474, 100.00000, 86.11541, 62.8484…
## $ time        <dbl> 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16…
## $ test        <dbl> 1, 1, 4, 22, 27, 27, 51, 61, 116, 187, 246, 312, 371, 429,…
## $ negative    <dbl> 0, 0, 3, 21, 25, 25, 47, 56, 97, 155, 199, 245, 289, 327, …
## $ confirmed   <dbl> 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 6, 11, 12, 15, 15, 16, 18, 2…
## $ released    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2…
## $ deceased    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Calculate new Cases, new tests, and new deceased
Time_cleaned <- Time %>%
  mutate(
    new_tests = ifelse(row_number() == 1, test, test - lag(test)),
    new_confirmed = ifelse(row_number() == 1, confirmed, confirmed - lag(confirmed)),
    new_deceased = ifelse(row_number() == 1, deceased, deceased - lag(deceased)),
    date = as.Date(date, format="%Y-%m-%d")  # Ensure date is in Date format
  )
merge <- merge(SearchTrend, Time_cleaned, by = "date")
glimpse(merge)
## Rows: 162
## Columns: 14
## $ date          <date> 2020-01-20, 2020-01-21, 2020-01-22, 2020-01-23, 2020-01…
## $ cold          <dbl> 0.19217, 0.22462, 0.23808, 0.30308, 0.34689, 0.70888, 0.…
## $ flu           <dbl> 0.70343, 0.59789, 0.56661, 0.55625, 0.40226, 0.39744, 0.…
## $ pneumonia     <dbl> 3.63716, 4.31987, 3.66416, 3.18035, 2.48156, 3.40926, 3.…
## $ coronavirus   <dbl> 20.69610, 35.33284, 29.74474, 100.00000, 86.11541, 62.84…
## $ time          <dbl> 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, …
## $ test          <dbl> 1, 1, 4, 22, 27, 27, 51, 61, 116, 187, 246, 312, 371, 42…
## $ negative      <dbl> 0, 0, 3, 21, 25, 25, 47, 56, 97, 155, 199, 245, 289, 327…
## $ confirmed     <dbl> 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 6, 11, 12, 15, 15, 16, 18,…
## $ released      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2,…
## $ deceased      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ new_tests     <dbl> 1, 0, 3, 18, 5, 0, 24, 10, 55, 71, 59, 66, 59, 58, 61, 1…
## $ new_confirmed <dbl> 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 2, 5, 1, 3, 0, 1, 2, 5, 1,…
## $ new_deceased  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
merge <- transform(merge,
                   mortality_rate = new_deceased / new_confirmed,
                   infection_rate = new_confirmed / new_tests)

# Convert date to a date format
merge$date <- as.Date(merge$date)
Policy$start_date <- as.Date(Policy$start_date)

# Create a graph of the mortality rate over Time with the x-axis showing months

# Plot 1: Mortality Rate Over Time

# Plotting
a <- ggplot(data = merge, aes(x = date)) +
  geom_line(aes(y = mortality_rate, color = "Mortality Rate"), size = 0.75) +
  geom_line(aes(y = infection_rate, color = "Infection Rate"), size = 0.75) +
  scale_x_date(date_labels = "%b", date_breaks = "1 month") +
  labs(title = "Increased Mortality Amidst Fewer Infections Highlights Critical Challenges",
       subtitle = "Mortality and Infection Rates Over Time from 20/01/2020 to 30/06/2020",
       y = "Rate") +
  xlab(NULL)+ 
  scale_color_manual(values = c("Mortality Rate" = "blue4", "Infection Rate" = "brown1")) +
  theme_minimal() +
  ylim(0, 0.7)  + # Adjust the y-axis limits as needed
  theme(
    panel.grid.major = element_blank(), 
    panel.background = element_blank(),
    plot.title = element_text(size = 13, face = "bold"),
    plot.subtitle = element_text(size = 11)
  )

# Display the plot
print(a)
## Warning: Removed 2 rows containing missing values (`geom_line()`).

# Plot 2: COVID-19 Metrics Over Time
merge2 <- left_join(merge, Policy, by = c("date" = "start_date"))

# Plot 2: COVID-19 Metrics Over Time
b <- ggplot(data = merge2, aes(x = date)) +
  geom_line(aes(y = new_tests, color = "Tests"), size = 0.75) +
  geom_line(aes(y = new_confirmed, color = "Infected"), size = 0.75) +
  geom_line(aes(y = new_deceased, color = "Deceased"), size = 0.75) +
  scale_y_log10(labels = scales::comma_format()) +
  labs(subtitle = "Tested, Infected, and Deceased Over Time from 20/01/2020 to 30/06/2020",
       y = "Volume") +
  xlab(NULL)+ 
  scale_color_manual(values = c("Tests" = "blue4", "Infected" = "brown1", "Deceased" = "chartreuse3")) +
  theme_minimal() + 
  theme(
    panel.grid.major = element_blank(), 
    panel.background = element_blank(),
    plot.title = element_text(size = 13, face = "bold"),
    plot.subtitle = element_text(size = 11)
  )

# Display the plots
print(a)
## Warning: Removed 2 rows containing missing values (`geom_line()`).

print(b)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

# Assuming a and b are the two ggplot objects
combined_plot <- plot_grid(a, b, ncol = 1, align = "v")
## Warning: Removed 2 rows containing missing values (`geom_line()`).
## Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
combined_plot